{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch as t\n",
    "import pandas as pd\n",
    "\n",
    "# Misc\n",
    "from typing import Callable\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class GreyWolfOptimiser:\n",
    "    \"\"\"\n",
    "    Binary Grey Wolf Optimiser for feature selection in a large dataset.\n",
    "\n",
    "    Each wolf is a vector of 1/0s with one entry per feature (1 = include the\n",
    "    feature, 0 = exclude it). On every iteration the three wolves with the lowest\n",
    "    fitness (alpha, beta, delta) are kept unchanged and every other wolf is moved\n",
    "    to a rounded generalised mean of three candidate positions, one per leader.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, error_function : Callable[[t.Tensor], float], \n",
    "                 population_size : int, num_dimensions : int, num_iterations : int, \n",
    "                 mean : float = 1.0, c_err : float = 1.0, c_size : float = 1.0, a : float = 2.0) -> None:\n",
    "        \"\"\"\n",
    "        Parameters\n",
    "        ----------------\n",
    "        error_function: scores one wolf (t.Tensor of num_dimensions 1/0s), lower is better\n",
    "        population_size: number of wolves, at least 3\n",
    "        num_dimensions: number of features to select from, at least 1\n",
    "        num_iterations: number of update rounds run by optimise()\n",
    "        mean: generalised mean exponent used to combine the three candidates\n",
    "        c_err, c_size: weights of the error term and of the feature-set size term\n",
    "        a: initial exploration coefficient, decayed linearly to 0 over the run\n",
    "\n",
    "        Raises\n",
    "        ----------------\n",
    "        ValueError: if population_size < 3 or num_dimensions < 1\n",
    "        \"\"\"\n",
    "\n",
    "        # Alpha, beta and delta must all exist, otherwise the update rule degenerates silently\n",
    "        if population_size < 3:\n",
    "            raise ValueError('population_size must be at least 3 (alpha, beta and delta wolves)')\n",
    "        # The size penalty divides by the number of features\n",
    "        if num_dimensions < 1:\n",
    "            raise ValueError('num_dimensions must be at least 1')\n",
    "\n",
    "        # This is the bottleneck. It evaluates how good a particular subset of features is. \n",
    "        # Function specs: input = t.Tensor (dim: num_dimensions). return = float (0-1).\n",
    "        self.err = error_function\n",
    "\n",
    "        # number of wolves (min 3, raise for larger feature spaces)\n",
    "        self.pop = population_size\n",
    "\n",
    "        # number of features to select from \n",
    "        self.dims = num_dimensions\n",
    "\n",
    "        # how many times to test solutions (raise for larger feature spaces)  \n",
    "        self.iters = num_iterations\n",
    "\n",
    "        # generalised mean exponent (1.0 = arithmetic mean, 0.0 = geometric mean)\n",
    "        self.mean = mean\n",
    "\n",
    "        # how much to prioritise error function minimisation vs. size of feature set\n",
    "        self.c_err = c_err\n",
    "        self.c_size = c_size\n",
    "\n",
    "        # how much wolves explore (increase for more exploration)\n",
    "        self.a = a\n",
    "\n",
    "    def init_wolves(self) -> t.Tensor:\n",
    "        \"\"\" Returns a (pop, dims) population of wolves with random 1/0s per feature to include/exclude \"\"\"\n",
    "        # Cast to the default float dtype so the error function receives the same dtype\n",
    "        # in the first iteration as in later ones (next_positions always yields floats).\n",
    "        return t.randint(2, (self.pop, self.dims)).to(t.get_default_dtype())\n",
    "\n",
    "    def fitness(self, wolves : t.Tensor) -> t.Tensor:\n",
    "        \"\"\" \n",
    "        Returns the fitness of each wolf in the population (lower is better):\n",
    "        c_err * error_function(wolf) + c_size * (fraction of features selected)\n",
    "        \"\"\"\n",
    "        scores = [\n",
    "            float(self.c_err * self.err(w) + self.c_size * (t.sum(w) / self.dims))\n",
    "            for w in wolves\n",
    "        ]\n",
    "        # t.tensor on plain floats replaces the legacy t.Tensor(...) constructor\n",
    "        return t.tensor(scores, dtype=t.get_default_dtype())\n",
    "\n",
    "    def compare_wolves(self, wolves : t.Tensor) -> tuple:\n",
    "        \"\"\" Returns (positions, fitness, followers): the three best wolves, their fitness, and all other wolves \"\"\"\n",
    "        scores = self.fitness(wolves)\n",
    "        order = t.argsort(scores)\n",
    "        ranked = wolves[order]\n",
    "        return ranked[:3], scores[order][:3], ranked[3:]\n",
    "\n",
    "    def _power_mean(self, x_alpha : t.Tensor, x_beta : t.Tensor, x_delta : t.Tensor) -> t.Tensor:\n",
    "        \"\"\" Returns the element-wise generalised mean (exponent self.mean) of three candidate positions \"\"\"\n",
    "        p = self.mean\n",
    "        if p == 1:\n",
    "            # Plain arithmetic mean; negative candidates are fine here\n",
    "            return (x_alpha + x_beta + x_delta) / 3\n",
    "\n",
    "        # Any other exponent is only defined for non-negative values; a fractional power\n",
    "        # of a negative candidate would be NaN and survive the later round/clamp.\n",
    "        stacked = t.stack((x_alpha, x_beta, x_delta)).clamp(min=0)\n",
    "        if p == 0:\n",
    "            # Limit of the power mean as p -> 0 is the geometric mean\n",
    "            return t.exp(t.log(stacked).mean(dim=0))\n",
    "        return stacked.pow(p).mean(dim=0).pow(1 / p)\n",
    "\n",
    "    def next_positions(self, wolves : t.Tensor, top : t.Tensor, \n",
    "                       curr_iter : int) -> t.Tensor:\n",
    "        \"\"\" \n",
    "        Move each wolf towards alpha, beta, delta and return the new population \n",
    "\n",
    "        Parameters\n",
    "        ----------------\n",
    "        wolves (type: t.Tensor, dim: (num_wolves, feature_dims)): \n",
    "            positions of the wolves to move (optimise passes the followers only)\n",
    "        top (type: t.Tensor, dim: (3, feature_dims)): \n",
    "            positions of alpha, beta, delta wolves\n",
    "        curr_iter (type: int): \n",
    "            current iteration number\n",
    "\n",
    "        Returns\n",
    "        ----------------\n",
    "        next_positions (type: t.Tensor, dim: (3 + num_wolves, feature_dims)): \n",
    "            alpha, beta, delta (unchanged) followed by the new 1/0 position of each wolf\n",
    "        \"\"\"\n",
    "\n",
    "        # Exploration coefficient decays linearly from self.a towards 0 over the run\n",
    "        a_curr = self.a * (1 - curr_iter / self.iters)\n",
    "\n",
    "        # Generate random vectors\n",
    "        # NOTE(review): one r1/r2 pair is shared by every wolf and all three leaders;\n",
    "        # drawing them independently would change the random stream, so it is left as is.\n",
    "        r1 = t.rand((1, self.dims))\n",
    "        r2 = t.rand((1, self.dims))\n",
    "        step = 2 * a_curr * r2 - a_curr\n",
    "\n",
    "        # One candidate position per leader: leader - step * |2 * r1 * leader - wolf|\n",
    "        candidates = []\n",
    "        for k in range(3):\n",
    "            leader = top[k:k + 1, :]\n",
    "            distance = t.abs(2 * r1 * leader - wolves)\n",
    "            candidates.append(leader - step * distance)\n",
    "\n",
    "        # Combine the candidates, then snap back onto the binary 1/0 grid\n",
    "        moved = self._power_mean(*candidates)\n",
    "        moved = t.clamp(t.round(moved), 0, 1)\n",
    "\n",
    "        # Preserve existing top solutions\n",
    "        return t.cat((top.to(moved.dtype), moved))\n",
    "\n",
    "    def optimise(self) -> tuple:\n",
    "        \"\"\" \n",
    "        Returns (best wolf, error history) after optimisation.\n",
    "        The history holds the alpha fitness measured at the start of each iteration.\n",
    "        \"\"\"\n",
    "\n",
    "        wolves = self.init_wolves()\n",
    "        err_history = []\n",
    "\n",
    "        for i in range(self.iters):\n",
    "            print(f'Iteration: {i}/{self.iters}')\n",
    "            top, err, followers = self.compare_wolves(wolves)\n",
    "            err_history.append(err[0])\n",
    "            wolves = self.next_positions(followers, top, i)\n",
    "\n",
    "        # Rank once more so the wolves produced by the last update are considered too\n",
    "        return self.compare_wolves(wolves)[0][0], err_history"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the dataset; the relative path is resolved against the kernel's working directory.\n",
    "data = pd.read_csv(\n",
    "    '../Datasets/reduced_II.csv',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature matrix: every column except the 'P_Id' column and the target.\n",
    "X = data.drop(columns=['P_Id', 'anxiety_meter'])\n",
    "# Target vector.\n",
    "y = data['anxiety_meter']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Handling missing values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.impute import SimpleImputer\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Replace each missing value with the mean of its column.\n",
    "# NOTE(review): with scikit-learn's default output setting X becomes a NumPy array here,\n",
    "# so the DataFrame column labels are no longer attached - confirm downstream cells expect that.\n",
    "imputer = SimpleImputer(strategy='mean')\n",
    "X = imputer.fit(X).transform(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
